cd
cd ..\..

local fname build_2_clean_data_elec
capture log close
local c_date = c(current_date)
local c_time = c(current_time)
local c_time_date = "`c_date'"+"_"+"`c_time'"
local c_time_date = subinstr("`c_time_date '", ":", "_", .)
local c_time_date = subinstr("`c_time_date '", " ", "_", .)
log using "output\logs\`fname'_`c_time_date'.txt", replace text

// CONFIRM EXISTENCE OF DIRECTORY FOR STORING TEMP FILES USED IN MERGING IN 2_clean_data_elec.do
cd "intermediate_data\data_cleaned_for_merge" // verify directory exists, will throw error if not
cd ..\.. // jump back to project directory

// CLEAN ELECTRICITY DATA

******************************************
*            Consumption Data            *
******************************************

use .\input\consumption_elec_2014-2018.dta, clear
sort ky_prem_no ky_ba ky_spt rate rate_description dt_rdg_from dt_rdg_to dc_rdg_srce tot_kwh elecratetype ad_serv_zip

* remove duplicate observations that differ only by missing zip code data (coded as zero)
duplicates tag ky_prem_no ky_ba ky_spt rate rate_description dt_rdg_from dt_rdg_to dc_rdg_srce tot_kwh elecratetype, gen(dup)
drop if dup==1 & ad_serv_zip==0
drop dup

* Define unique observations 
gen date_start=date(dt_rdg_from,"MDY")
format date_start %td
gen date_end=date(dt_rdg_to,"MDY")
format date_end %td
replace date_start=date_start+1 

gen days_between=date_end-date_start
drop if days_between>35 | days_between<24 
* gen daily consumption 
gen day_kwh=tot_kwh/days_between 

* gen start and end month and year
gen year_start=year(date_start)
gen month_start=month(date_start)
gen year_end=year(date_end)
gen month_end=month(date_end)

// Generate the unique month for each account, reading period----------------
// define end of month date for date_start
gen end_date_1 = .
foreach n of numlist 2013/2018 {
replace end_date_1=mdy(01,31,`n') if month_start==1 &year_start==`n'
replace end_date_1=mdy(02,28,`n') if month_start==2 &year_start==`n'
replace end_date_1=mdy(03,31,`n') if month_start==3 &year_start==`n'
replace end_date_1=mdy(04,30,`n') if month_start==4 &year_start==`n'
replace end_date_1=mdy(05,31,`n') if month_start==5 &year_start==`n'
replace end_date_1=mdy(06,30,`n') if month_start==6 &year_start==`n'
replace end_date_1=mdy(07,31,`n') if month_start==7 &year_start==`n'
replace end_date_1=mdy(08,31,`n') if month_start==8 &year_start==`n'
replace end_date_1=mdy(09,30,`n') if month_start==9 &year_start==`n'
replace end_date_1=mdy(10,31,`n') if month_start==10 &year_start==`n'
replace end_date_1=mdy(11,30,`n') if month_start==11 &year_start==`n'
replace end_date_1=mdy(12,31,`n') if month_start==12 &year_start==`n'
}
replace end_date_1=mdy(02,29,2016) if month_start==2 &year_start==2016 
format end_date_1 %td

// define start of month date for date_end
gen start_date_1 = .
foreach n of numlist 2013/2018 {
replace start_date_1=mdy(01,1,`n') if month_end==1 &year_end==`n'
replace start_date_1=mdy(02,1,`n') if month_end==2 &year_end==`n'
replace start_date_1=mdy(03,1,`n') if month_end==3 &year_end==`n'
replace start_date_1=mdy(04,1,`n') if month_end==4 &year_end==`n'
replace start_date_1=mdy(05,1,`n') if month_end==5 &year_end==`n'
replace start_date_1=mdy(06,1,`n') if month_end==6 &year_end==`n'
replace start_date_1=mdy(07,1,`n') if month_end==7 &year_end==`n'
replace start_date_1=mdy(08,1,`n') if month_end==8 &year_end==`n'
replace start_date_1=mdy(09,1,`n') if month_end==9 &year_end==`n'
replace start_date_1=mdy(10,1,`n') if month_end==10 &year_end==`n'
replace start_date_1=mdy(11,1,`n') if month_end==11 &year_end==`n'
replace start_date_1=mdy(12,1,`n') if month_end==12 &year_end==`n'
}
format start_date_1 %td

//calculate days in start month and end month
gen days_start=end_date_1-date_start
gen days_end=date_end-start_date_1

// generate month according to more days in which month 
gen month=month_start if days_start>days_end 
replace month=month_end if days_start<=days_end

// fix month when start month = end month +1 (e.g. Dec 30 2013- Feb 1 2014 => Month 13 "2013") 
//replace month=month_start+1 if days_start==days_end & days_start==0
replace month=month_start+1 if month_end-month_start==2 | month_end-month_start==-10

// generate year 
gen year=year_start
replace year = year+1 if (month==1 & year_start!=year_end) 
replace year = year+1 if month==13 
replace month=1 if month==13 

// Generate unique month 
gen uni_month = month
replace uni_month = uni_month+12 if year==2014
replace uni_month = uni_month+24 if year==2015
replace uni_month = uni_month+36 if year==2016
replace uni_month = uni_month+48 if year==2017
replace uni_month = uni_month+60 if year==2018
rename month month_in_year
rename uni_month month

// check duplicative months
bysort ky_ba month: gen dup_2=_n
tab dup_2

// Drop duplicative month with zero kwh
bysort ky_ba month: gen dup_3=_N
drop if tot_kwh==0 & dup_3!=1

capture drop dup_2
bysort ky_ba month: gen dup_2=_n
tab dup_2

// Mark account as closed 
gen closed=0
replace closed=1 if strpos(dc_rdg_srce,"Final")>0
replace closed=. if closed==0
bysort ky_ba (month): replace closed=closed[_n-1] if closed==.
bysort ky_ba (month): replace closed=closed[_N] if closed==.
replace closed=0 if closed==. 

// drop final reading observations 
drop if dup_3!=1 & dc_rdg_srce=="Estimated Final R"
drop if dup_3!=1 & dc_rdg_srce=="Estimated Final Reading"
drop if dup_3!=1 & dc_rdg_srce=="Final Reading"
drop if dup_3!=1 & dc_rdg_srce=="Manual Final Read"
drop if dup_3!=1 & dc_rdg_srce=="Manual Final Reading"

capture drop dup_2
bysort ky_ba month: gen dup_2=_n
tab dup_2

drop if year==2013 
drop if month==65  

**************************************
count // # of observations left

* Rename month [1,60]
replace month=month-12
* now 2014 Jan is labeled as month==1 and 2015 Jan is labeled as month==13 

// Find duplicate of month in each ky_ba
capture drop dup
bysort ky_ba month: gen dup=_n
tab dup

// drop observations with same reading period but one reading is the duplicate 
* a few accounts have repeated monthly readings
capture drop dup_obs
bysort ky_ba dt_rdg_from dt_rdg_to: gen dup_obs=_N
drop if dup_obs==2
* Check
assert dup_obs==1 // only 1 (unique observation) 

capture drop dup_obs_2 
bysort ky_ba dt_rdg_from dt_rdg_to: gen dup_obs_2=_n
*checker
assert dup_obs_2==1

capture drop dup
bysort ky_ba month: gen dup=_n
tab dup


// Deal with dup==2
//-------------------

// Correct remaining problem with month: the observations with equal number of 
// days in two month, the month before or after it results in duplicates
rename month_end month_end_in_year
gen month_end = month_end_in_year
replace month_end = month_end+12 if year_end==2014
replace month_end = month_end+24 if year_end==2015
replace month_end = month_end+36 if year_end==2016
replace month_end = month_end+48 if year_end==2017
replace month_end = month_end+60 if year_end==2018
replace month_end=month_end-12
tab month_end
* Expect [1,53] gets [1,53]

* LOOP: push dups to adapt month_end
local i=1
foreach i of numlist 1/53 {
	bysort ky_ba month: gen dup_N_x=_N
	replace month=month_end if dup_N_x==2 
	drop dup_N_x
}
capture drop dup_2
bysort ky_ba month: gen dup_2=_n
tab dup_2

// correct the year for the originally Dec month being pushed to Jan of next year 
replace year=2015 if month==13 & year!=2015 
replace year=2016 if month==25 & year!=2016
replace year=2017 if month==37 & year!=2017
replace year=2018 if month==49 & year!=2018
* checker 
assert year==2014 if month>=1 & month<=12
assert year==2015 if month>=13 & month<=24
assert year==2016 if month>=25 & month<=36
assert year==2017 if month>=37 & month<=48
assert year==2018 if month>=49 & month<=53
* expect 0 get 0 

// manually deal with the remaining duplicates
* omitted this part of code due to confidential billing account numbers. 
* check removed all duplicates
assert dup_2==1

// remove unecessary variables 
drop date_start date_end year_start month_start year_end month_end_in_year ///
end_date_1 start_date_1 days_start days_end dup_3 dup month_end dup_N_2 dup_2 

// the month_in_year variable should also be corrected after duplicates push forward 
* loop above. this is to make sure month_in_year matches month 

// correct year again 
* repeat this block of code after manual correction
replace year=2015 if month==13 & year!=2015 
replace year=2016 if month==25 & year!=2016
replace year=2017 if month==37 & year!=2017
replace year=2018 if month==49 & year!=2018
* checker 
assert year==2014 if month>=1 & month<=12
assert year==2015 if month>=13 & month<=24
assert year==2016 if month>=25 & month<=36
assert year==2017 if month>=37 & month<=48
assert year==2018 if month>=49 & month<=53
* expect 0 get 0 

// correct the year for the originally Dec month being pushed to Jan of next year
replace month_in_year=month-48 if year==2018
replace month_in_year=month-36 if year==2017
replace month_in_year=month-24 if year==2016
replace month_in_year=month-12 if year==2015
replace month_in_year=month if year==2014

tab month_in_year
* check month_in_year [1,12]
assert month_in_year<=12 & month_in_year>=1

// label renter if number of ba>=2 for the same prem_no. 
// 40% generally matches with info from external sources. 
bysort ky_prem_no ky_ba: gen temp=_n==1
bysort ky_prem_no: egen n_acct=sum(temp)
gen renter=1 if n_acct>=2
replace renter=0 if renter==.
tab renter

//income 
count if rate==6 & rate_description!="Elec A-60 Resi Low Income-Std Ofr"
count if rate==1 & rate_description!="Elec A-16 Residential-Std Ofr"
gen low_income=0
replace low_income=1 if (rate==6 | rate==905)
tab low_income

// Deal with outliers
summarize tot_kwh,detail
gen outlier=0 if inrange(tot_kwh, r(p1),r(p99))
replace outlier=1 if outlier==.
tab outlier, summarize(tot_kwh) 
drop if outlier==1

save .\intermediate_data\data_cleaned_for_merge\consumption_elec_2014-2018_for_merge.dta, replace 
// save for generating separte dataset for spillover analysis


********************************************************************************
*                    Merge Consumption and Opower Data                         *
********************************************************************************
// merge in opower data 
merge m:1 ky_ba using .\intermediate_data\data_cleaned_for_merge\opower_for_merge_elec.dta, keep(match master) nogen

* Non-matched are Opower non-participants 
replace group="Non-part" if group==""

// checker 
assert opower_month==. if group=="Non-part"
assert opower_month!=. if group!="Non-part"

// name consistent with elec analysis
rename group group_opower_paper 
rename elec opower_elec_paper
rename gas opower_gas_paper
rename opower_month opower_start_month

// generate opower wave indicator 
gen opower_wave=1 if opower_start_month==-3     // march 2013
replace opower_wave=2 if opower_start_month==-4 // april 2013
replace opower_wave=3 if opower_start_month==3  // march 2014
replace opower_wave=4 if opower_start_month==8  // etc.
replace opower_wave=5 if opower_start_month==22
replace opower_wave=6 if opower_start_month==32
replace opower_wave=7 if opower_start_month==39
replace opower_wave=8 if opower_start_month==50
rename opower_wave opower_paper_wave

********************************************************************************
*           Merge Consumption, Opower, and Demographics Data                   *
********************************************************************************
// merge in demographic data 
merge m:1 ky_ba ad_serv_zip using .\intermediate_data\data_cleaned_for_merge\demo_for_merge_ele.dta, keep(match master) nogen


********************************************************************************
*                     Create other useful variables                            *
********************************************************************************

// generate ever-participated program indicators (known participants only)
* opower
gen opower=1 if group_opower=="treatment"
replace opower=0 if group_opower=="control"
order opower, after(group_opower_paper)
* checker
assert opower!=. if group_opower_paper!="Non-part"
assert opower==. if group_opower_paper=="Non-part"

// generate month_since participated in the program
replace opower_start_month=-10 if opower_start_month==-3 // march 2013
replace opower_start_month=-9 if opower_start_month==-4  // april 2013

local program "opower"
foreach var in `program'{
	bysort ky_ba (month): gen month_since_`var' = month - `var'_start_month
	order month_since_`var', after(`var'_start_month)
}

// generate post program 
local program "opower"
foreach var in `program'{
	bysort ky_ba (month): gen post_`var'=1 if month_since_`var'>=0 & month_since_`var'!=. 
	replace post_`var'=0 if month_since_`var'<0 & month_since_`var'!=. 
	order post_`var',after(month_since_`var')
}

* checker 
assert post_opower!=1 if month_since_opower<0
assert post_ew!=1 if month_since_ew<0
assert post_amp!=1 if month_since_amp<0

// generate program part and post interaction term 
local program "opower"
foreach var in `program'{
	gen `var'_dt = `var' * post_`var'
	order `var'_dt, after(post_`var')
}

// drop wave 4 pre-period because wave 4 is for new movers only
count if opower_paper_wave==4 & month_since_opower<0
drop if opower_paper_wave==4 & month_since_opower<0

// feedback
* Calculate the mean & median for each zip code 
bysort income ad_serv_zip month: egen zip_mean=mean(tot_kwh)
bysort income ad_serv_zip month: egen zip_sd=sd(tot_kwh)
bysort income ad_serv_zip month: egen zip_median=median(tot_kwh)

gen median_30_above=zip_median*1.3
gen median_30_below=zip_median*0.7

* Generate feedback type 
gen feedback="normal"
replace feedback="negative" if tot_kwh>median_30_above
replace feedback="positive" if tot_kwh<median_30_below
encode feedback, gen(feedback_num)
tab feedback,generate(feedback_)
* 1 is negative, 2 is normal, 3 is positive
gen feedback1Xopower_dt=feedback_1*opower_dt
gen feedback3Xopower_dt=feedback_3*opower_dt

// generate pre-treatment average consumption for different periods
foreach mon in 12 24 {
	gen pre_temp = tot_kwh if inrange(month_since_opower, -`mon', -1)
	bysort ky_ba: egen pre_avg_`mon' = mean(pre_temp) // compute average by account
	lab var pre_avg_`mon'	"`mon'-month pre-treatment consumption (kWh)"
	drop pre_temp
}

// generate income categories 
gen inc=0 
replace inc=1 if income_temp=="$0 to $5,000" | income_temp=="$5,001 to $10,000" | income_temp=="$10,001 to $15,000"| ///
income_temp=="$15,001 to $20,000" | income_temp=="$20,001 to $25,000"
replace inc=2 if income_temp=="$25,001 to $30,000" | income_temp=="$30,001 to $35,000" | income_temp=="$35,001 to $40,000"| ///
income_temp=="$40,001 to $45,000" | income_temp=="$45,001 to $50,000"
replace inc=3 if income_temp=="$50,001 to $55,000" | income_temp=="$55,001 to $60,000" | income_temp=="$60,001 to $65,000"| ///
income_temp=="$65,001 to $70,000" | income_temp=="$70,001 to $75,000"
replace inc=4 if income_temp=="$75,001 to $80,000" | income_temp=="$80,001 to $85,000" | income_temp=="$85,001 to $90,000"| ///
income_temp=="$90,001 to $95,000" | income_temp=="$95,001 to $100,000"
replace inc=5 if income_temp=="$100,001 to $105,000" | income_temp=="$105,001 to $110,000" | income_temp=="$110,001 to $115,000"| ///
income_temp=="$115,001 to $120,000"| income_temp=="$120,001 to $125,000" 
replace inc=6 if income_temp=="$125,001 to $130,000"| income_temp=="$130,001 to $135,000" | income_temp=="$135,001 to $140,000"| ///
income_temp=="$140,001 to $145,000" | income_temp=="$145,001 or more" 

********************************************************************************
*                           clean demographics                                 *
********************************************************************************

// replace implausible zeros with missing values
replace vintage = . if vintage==0
replace building_size = . if building_size==0
replace uni_size = . if uni_size==0

********************************************************************************
*                          label key variables                                 *
********************************************************************************
lab var opower 			"Opower"
lab var opower_paper_wave "Opower wave"
lab var tot_kwh 		"Monthly electricity consumption (kWh)"
lab var income 			"Income (\\\$)"
lab var member 			"Number of household members"
lab var building_size	"Building size (ft$^2$)"
lab var uni_size 		"Unit size (ft$^2$)"
lab var vintage 		"House Year Built" 
lab var married 		"Married"

********************************************************************************
*                              save data                                       *
********************************************************************************
save .\intermediate_data\elec_2014-2018_mainfile.dta, replace


********************************************************************************
*                save data with wave 3, 6, 7 without last month                *
********************************************************************************
drop if opower_paper_wave==1 | opower_paper_wave==2 // insufficient pre-treatment consumption data
drop if opower_paper_wave==8 // insufficient post-treatment consumption data
drop if opower==. // drop non-participants
drop if opower_paper_wave==4 // drop new movers wave
tab opower_paper_wave
tab month
drop if month==53
save .\intermediate_data\elec_2014-2018_wave367.dta, replace


********************************************************************************
* generate pre-treatment consumption as mean of Jan and Feb consumption prior to treatment
********************************************************************************
use .\intermediate_data\elec_2014-2018_wave367.dta, clear

* generate post treatment average
bysort ky_ba: egen post_avg=mean(tot_kwh) if month_since_opower>=0  & month_since_opower<12
bysort ky_ba (month): replace post_avg=post_avg[_n-1] if post_avg==.
bysort ky_ba (month): replace post_avg=post_avg[_N] if post_avg==.
lab var post_avg "12-month post-treatment consumption (kWh)"

* generate Jan and Feb pre-treatment average
bysort ky_ba: egen pre_avg_winter=mean(tot_kwh) if month_since_opower<0 & (month_in_year==1 | month_in_year==2)
bysort ky_ba (month): replace pre_avg_winter=pre_avg_winter[_n-1] if pre_avg_winter==.
bysort ky_ba (month): replace pre_avg_winter=pre_avg_winter[_N] if pre_avg_winter==.
lab var pre_avg_winter	"Winter pre-treatment consumption (kWh)"

* generate most recent Jan and Feb pre-treatment average 
bysort ky_ba: egen pre_avg_winter_recent=mean(tot_kwh) if inrange(month_since_opower, -12, -1) & (month_in_year==1 | month_in_year==2)
bysort ky_ba (month): replace pre_avg_winter_recent=pre_avg_winter_recent[_n-1] if pre_avg_winter_recent==.
bysort ky_ba (month): replace pre_avg_winter_recent=pre_avg_winter_recent[_N] if pre_avg_winter_recent==.
lab var pre_avg_winter_recent	"Most recent winter pre-treatment consumption (kWh)"

save .\intermediate_data\elec_wave367_winterpre.dta, replace

export delimited using .\intermediate_data\elec_wave367_winterpre.csv, replace

* remove duplicates for each account 
bysort ky_ba: gen dup=_n
drop if dup>1

* remove other unecessary variables 
drop rate rate_description elecratetype days_between temp dup

save .\intermediate_data\elec_wave367_cross_winterpre.dta, replace

export delimited using .\intermediate_data\elec_wave367_cross_winterpre.csv, replace


********************************************************************************
*             Outliers and censoring
********************************************************************************
use .\intermediate_data\elec_wave367_winterpre.dta, replace

* drop accounts with missing or zero covariates
drop if pre_avg_12==. | income==. | building_size==. | uni_size==. | member==. | vintage==. | post_avg==. 
drop if pre_avg_12==0 | income==0 | building_size==0 | uni_size==0 | member==0 | vintage==0 | post_avg==0 

* censor outliers 
replace uni_size=5000 if uni_size>5000
replace vintage=1850 if vintage<1850

save .\intermediate_data\elec_wave367_winterpre.dta, replace

* remove duplicates for each account 
bysort ky_ba: gen dup=_n
drop if dup>1

* remove other unecessary variables 
keep ky_ba opower opower_paper_wave income building_size uni_size member vintage pre_avg_12 post_avg pre_avg_winter pre_avg_winter_recent 

save .\intermediate_data\elec_wave367_cross_winterpre.dta, replace


********************************************************************************
*         Create sample with no missing pre-treatment consumption data only
********************************************************************************
use .\intermediate_data\elec_2014-2018_wave367.dta, clear

* generate post treatment average
bysort ky_ba: egen post_avg=mean(tot_kwh) if month_since_opower>=0  & month_since_opower<12
bysort ky_ba (month): replace post_avg=post_avg[_n-1] if post_avg==.
bysort ky_ba (month): replace post_avg=post_avg[_N] if post_avg==.
lab var post_avg "12-month post-treatment consumption (kWh)"

* generate Jan and Feb pre-treatment average
bysort ky_ba: egen pre_avg_winter=mean(tot_kwh) if month_since_opower<0 & (month_in_year==1 | month_in_year==2)
bysort ky_ba (month): replace pre_avg_winter=pre_avg_winter[_n-1] if pre_avg_winter==.
bysort ky_ba (month): replace pre_avg_winter=pre_avg_winter[_N] if pre_avg_winter==.
lab var pre_avg_winter	"Winter pre-treatment consumption (kWh)"

* generate most recent Jan and Feb pre-treatment average 
bysort ky_ba: egen pre_avg_winter_recent=mean(tot_kwh) if inrange(month_since_opower, -12, -1) & (month_in_year==1 | month_in_year==2)
bysort ky_ba (month): replace pre_avg_winter_recent=pre_avg_winter_recent[_n-1] if pre_avg_winter_recent==.
bysort ky_ba (month): replace pre_avg_winter_recent=pre_avg_winter_recent[_N] if pre_avg_winter_recent==.
lab var pre_avg_winter_recent	"Most recent winter pre-treatment consumption (kWh)"

* drop accounts with missing or zero pre_avg_12 or post_avg
drop if pre_avg_12==. | post_avg==. 
drop if pre_avg_12==0 | post_avg==0 

save .\intermediate_data\elec_wave367_winterpre_nomissing_baseline.dta, replace
export delimited using .\intermediate_data\elec_wave367_winterpre_nomissing_baseline.csv, replace

* remove duplicates for each account 
bysort ky_ba: gen dup=_n
drop if dup>1

* remove other unecessary variables 
keep ky_ba opower opower_paper_wave income building_size uni_size member vintage pre_avg_12 post_avg pre_avg_winter pre_avg_winter_recent 

save .\intermediate_data\elec_wave367_cross_winterpre_nomissing_baseline.dta, replace
export delimited using .\intermediate_data\elec_wave367_cross_winterpre_nomissing_baseline.csv, replace

********************************************************************************
capture graph close
capture log close
exit
